In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

import networkx as nx
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
px.set_mapbox_access_token(open("./mapbox_token").read())
# pip install plotly_express==0.4.0
from bs4 import BeautifulSoup

from warnings import filterwarnings
filterwarnings('ignore')

from datetime import datetime 

# Data Source
# https://www.kaggle.com/kimjihoo/coronavirusdataset?select=SeoulFloating.csv

from sklearn.model_selection import train_test_split

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

# Matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

# Diagram
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.externals.six import StringIO  
from IPython.display import Image

https://stackoverflow.com/questions/49647705/jupyter-nbextensions-does-not-appear/50663099#:~:text=First%2C%20follow%20this%20SO%2Dpost,and%20should%20see%20the%20crosshair.

Enable Jupyter notebook extensions (spell checker, code folding, table of contents)

  • pip install jupyter_contrib_nbextensions
  • jupyter contrib nbextension install --user
  • jupyter nbextension enable spellchecker/main
  • jupyter nbextension enable codefolding/main

Exploratory Data Analysis

Case

Overview of the cases in Korea

In [3]:
caseData = pd.read_csv('covid/case.csv')

# Coordinates are stored as strings with '-' for missing values; coerce to
# numeric so the missing entries become NaN and can be dropped cleanly.
# (The old boolean-DataFrame comparison against '-' only masked cells to NaN,
# it never removed rows.)
cols = ['latitude', 'longitude']
caseData[cols] = caseData[cols].apply(pd.to_numeric, errors='coerce', axis=1)
display(caseData.head())

# Keep only rows with valid coordinates, and build the hover text from that
# same filtered frame so text and plotted points stay aligned.
caseDataForMap = caseData.dropna(subset=cols)

fig = px.scatter_mapbox(
    caseDataForMap,
    text="<br>City: " + caseDataForMap["city"] + " <br>Province: " + caseDataForMap["province"],
    lat="latitude",
    lon="longitude",
    color="confirmed",
    size="confirmed",
    color_continuous_scale=px.colors.sequential.Burg,
    size_max=100,
    mapbox_style='dark',
    zoom=6,
    title="Overview of the cases in Korea")
fig.show()
case_id province city group infection_case confirmed latitude longitude
0 1000001 Seoul Yongsan-gu True Itaewon Clubs 139 37.538621 126.992652
1 1000002 Seoul Gwanak-gu True Richway 119 37.482080 126.901384
2 1000003 Seoul Guro-gu True Guro-gu Call Center 95 37.508163 126.884387
3 1000004 Seoul Yangcheon-gu True Yangcheon Table Tennis Club 43 37.546061 126.874209
4 1000005 Seoul Dobong-gu True Day Care Center 43 37.679422 127.044374

Check the relationship between group and solo cases

In [4]:
caseData = pd.read_csv('covid/case.csv')
caseDataForMap = caseData.copy()
caseDataSorted = caseDataForMap.sort_values(by=['confirmed'], ascending=False)
display(caseDataSorted.head())
# Sum only the 'confirmed' column: a bare .sum() would also aggregate ids and
# string columns (latitude/longitude contain '-') before they are discarded.
sortedValues = (caseDataForMap
                .groupby(['province', 'group'])['confirmed']
                .sum()
                .sort_values(ascending=False)
                .reset_index())
sortedValues = sortedValues[['province', 'confirmed', 'group']]
display(sortedValues.head())

fig = px.bar(sortedValues, x='confirmed', y='province', color='group', barmode='group')
fig.update_layout(hovermode='y')
fig.show()
case_id province city group infection_case confirmed latitude longitude
48 1200001 Daegu Nam-gu True Shincheonji Church 4511 35.84008 128.5667
56 1200009 Daegu - False contact with patient 917 - -
57 1200010 Daegu - False etc 747 - -
145 6000001 Gyeongsangbuk-do from other city True Shincheonji Church 566 - -
109 2000020 Gyeonggi-do - False overseas inflow 305 - -
province confirmed group
0 Daegu 4975 True
1 Daegu 1705 False
2 Gyeongsangbuk-do 979 True
3 Seoul 720 True
4 Seoul 560 False

Source of infected cases, sorted by source and province

In [5]:
# Used log, because Daegu has a high amount of infection which highly skewed the data
# caseData['logConfirmed'] = np.log(caseData['confirmed'])

# Keyword arguments: positional pivot(index, columns, values) was removed in
# pandas 2.0.
locations = caseData.pivot(index="province", columns="infection_case", values="confirmed")
display(locations.head())

f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(locations, cmap="BrBG", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Numbers of cases sort by source and province')
infection_case Anyang Gunpo Pastors Group Biblical Language study meeting Bonghwa Pureun Nursing Home Bundang Jesaeng Hospital Changnyeong Coin Karaoke Cheongdo Daenam Hospital Coupang Logistics Center Daejeon door-to-door sales Daesil Convalescent Hospital Daezayeon Korea ... Yangcheon Table Tennis Club Yechun-gun Yeonana News Class Yeongdeungpo Learning Institute Yongin Brothers contact with patient etc gym facility in Cheonan gym facility in Sejong overseas inflow
province
Busan NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN 19.0 30.0 NaN NaN 36.0
Chungcheongbuk-do NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN 8.0 11.0 NaN NaN 13.0
Chungcheongnam-do NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN 2.0 12.0 103.0 NaN 16.0
Daegu NaN NaN NaN NaN NaN 2.0 NaN NaN 101.0 NaN ... NaN NaN NaN NaN NaN 917.0 747.0 NaN NaN 41.0
Daejeon NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN 15.0 15.0 NaN NaN 15.0

5 rows × 81 columns

Out[5]:
Text(0.5, 1, 'Numbers of cases sort by source and province')

Risk Level of each province

In [6]:
# Daegu's outbreak dwarfs the other provinces, so compare per-province
# distributions on a log scale.
caseData = pd.read_csv('covid/case.csv')
caseDataForMap = caseData.assign(logConfirmed=np.log(caseData['confirmed']))

fig = px.box(caseDataForMap, x="logConfirmed", y="province")
fig.update_traces(quartilemethod="exclusive")  # or "inclusive", or "linear" by default
fig.show()
In [7]:
caseDataForMap = caseData.copy()
# Aggregate only the numeric 'confirmed' counts per province; a bare .sum()
# would also try to aggregate id and string columns before they are dropped.
sortedValues = (caseDataForMap
                .groupby('province', as_index=False)['confirmed']
                .sum()
                .sort_values(by='confirmed', ascending=False)
                .reset_index(drop=True))
sortedValues['logConfirmed'] = np.log(sortedValues['confirmed'])
sortedValues = sortedValues[['province', 'confirmed', 'logConfirmed']]
display(sortedValues.head())
province confirmed logConfirmed
0 Daegu 6680 8.806873
1 Gyeongsangbuk-do 1324 7.188413
2 Seoul 1280 7.154615
3 Gyeonggi-do 1000 6.907755
4 Incheon 202 5.308268
In [8]:
# Restrict to numeric columns: 'province' is a string and cannot be averaged.
# (Also fixed the "Standard Derivation" typo in the printed label.)
print('Mean: ', sortedValues.mean(numeric_only=True).values)
print('Standard Deviation: ', sortedValues.std(numeric_only=True).values)
Mean:  [670.29411765   4.96126521]
Standard Derivation:  [1610.8280776     1.65884891]

The number of COVID-19 cases per province has a mean of about 670 and a standard deviation of about 1610.

Hence we sort the provinces into 4 different types of level:

  • Very High Risk = 4 (2000 or more cases)
  • High Risk = 3 (1000 - 1999 cases)
  • Medium Risk = 2 (100 - 999 cases)
  • Low Risk = 1 (Below 100 cases)
In [9]:
def calculate_risk_level(sortedValues) :
    """Map a row's confirmed-case count to a discrete 1-4 risk level.

    Levels: 4 = Very High (>= 2000), 3 = High (1000-1999),
    2 = Medium (100-999), 1 = Low (< 100).
    """
    if sortedValues['confirmed'] >= 2000:
        return 4
    elif sortedValues['confirmed'] >= 1000:
        return 3
    # Fixed: the upper bound was 900, which misclassified 900-999 as Low
    # even though the stated Medium band is 100-999. The explicit upper
    # bounds are redundant after the earlier branches, so they are dropped.
    elif sortedValues['confirmed'] >= 100:
        return 2
    else:
        return 1
    
# Attach a discrete 1-4 risk level to each province, then keep only the
# columns needed for the summary table.
sortedValues['Risk Level'] = sortedValues.apply(calculate_risk_level, axis=1)
sortedValues = sortedValues[['province', 'Risk Level']]
display(sortedValues.head())
province Risk Level
0 Daegu 4
1 Gyeongsangbuk-do 3
2 Seoul 3
3 Gyeonggi-do 3
4 Incheon 2

Patient Info

Numbers Of Cases Per Day

In [10]:
patientData = pd.read_csv('covid/patientinfo.csv')
display(patientData.head())

# Daily case counts: number of patients confirmed on each date.
groupPatientData = (patientData
                    .groupby('confirmed_date')
                    .size()
                    .reset_index(name='count'))

fig = px.line(groupPatientData, x="confirmed_date", y="count", title='Numbers Of Covid Cases Per Day')
fig.show()
patient_id sex age country province city infection_case infected_by contact_number symptom_onset_date confirmed_date released_date deceased_date state
0 1000000001 male 50s Korea Seoul Gangseo-gu overseas inflow NaN 75 2020-01-22 2020-01-23 2020-02-05 NaN released
1 1000000002 male 30s Korea Seoul Jungnang-gu overseas inflow NaN 31 NaN 2020-01-30 2020-03-02 NaN released
2 1000000003 male 50s Korea Seoul Jongno-gu contact with patient 2002000001 17 NaN 2020-01-30 2020-02-19 NaN released
3 1000000004 male 20s Korea Seoul Mapo-gu overseas inflow NaN 9 2020-01-26 2020-01-30 2020-02-15 NaN released
4 1000000005 female 20s Korea Seoul Seongbuk-gu contact with patient 1000000002 2 NaN 2020-01-31 2020-02-24 NaN released

Average Survivor Treatment Day (Recovery Speed), By Age and Gender

In [11]:
confinedDaysAnalysis = patientData.copy()
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'] != '100s'] # single record that heavily skews the data
# Survivors only: keep patients with a release date.
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['released_date'].notnull()]

cols = ['released_date', 'confirmed_date']
confinedDaysAnalysis[cols] = confinedDaysAnalysis[cols].apply(pd.to_datetime, errors='coerce', axis=1)
# Days from confirmation to release.
confinedDaysAnalysis['Total Treatment Days'] = (confinedDaysAnalysis['released_date'] - confinedDaysAnalysis['confirmed_date']).dt.days

# .reset_index() replaces the redundant .unstack().stack() round-trip.
groupedData = confinedDaysAnalysis.groupby(['sex', 'age'])['Total Treatment Days'].mean().reset_index()
groupedData.columns = ["sex", "age", "Average Treatment Days"]

# Keyword arguments: positional pivot(index, columns, values) was removed in
# pandas 2.0.
dataForHeatmap = groupedData.pivot(index="sex", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Survivor Treatment Day, Sort by Age & Gender')
age 0s 10s 20s 30s 40s 50s 60s 70s 80s 90s
sex
female 28.375000 19.80 23.659091 23.415094 22.025806 23.310345 26.543689 31.344828 33.738095 28.461538
male 21.727273 21.45 23.275132 22.971429 26.581395 24.666667 27.257143 36.200000 37.055556 33.000000
Out[11]:
Text(0.5, 1, 'Average Survivor Treatment Day, Sort by Age & Gender')
In [12]:
# Map decade labels to ordinal codes; labels not in the dict (and NaN) map to
# NaN and are filtered out below. (The previous `age.dropna()` assignment was
# a no-op: assigning a shorter Series back re-aligns on the index and
# re-inserts the NaN values.)
age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
confinedDaysAnalysis['age'] = confinedDaysAnalysis['age'].map(age)
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'].notna()]

fig = px.scatter(confinedDaysAnalysis, x="age", y="Total Treatment Days", color="sex", trendline="ols", title="Does age group affect total treatment days?")
fig.show()
Average Survivor Treatment Day (Recovery Speed) By City & Age
In [13]:
# .reset_index() replaces the redundant .unstack().stack() round-trip.
groupedData = confinedDaysAnalysis.groupby(['province', 'age'])['Total Treatment Days'].mean().reset_index()
groupedData.columns = ["province", "age", "Average Treatment Days"]
# Round up to whole days for a cleaner heatmap scale.
groupedData['Average Treatment Days'] = np.ceil(groupedData['Average Treatment Days'].apply(pd.to_numeric, errors='coerce'))

# Keyword arguments: positional pivot() was removed in pandas 2.0.
dataForHeatmap = groupedData.pivot(index="province", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Survivor Treatment Day, Sort by City & Age')
age 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0
province
Chungcheongbuk-do 49.0 20.0 24.0 23.0 23.0 25.0 23.0 26.0 26.0 20.0
Chungcheongnam-do 17.0 23.0 23.0 26.0 24.0 27.0 27.0 43.0 9.0 NaN
Daegu NaN NaN NaN NaN NaN 9.0 7.0 12.0 NaN NaN
Daejeon NaN 15.0 22.0 23.0 18.0 26.0 40.0 24.0 17.0 NaN
Gangwon-do NaN NaN 12.0 24.0 27.0 19.0 25.0 NaN NaN NaN
Out[13]:
Text(0.5, 1, 'Average Survivor Treatment Day, Sort by City & Age')

Average Deceased Treatment Day (Fatality Speed) By Gender and Age

In [14]:
confinedDaysAnalysis = patientData.copy()
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['age'] != '100s'] # single record that heavily skews the data
# Deceased only: keep patients with a deceased date.
confinedDaysAnalysis = confinedDaysAnalysis[confinedDaysAnalysis['deceased_date'].notnull()]

cols = ['deceased_date', 'confirmed_date']
confinedDaysAnalysis[cols] = confinedDaysAnalysis[cols].apply(pd.to_datetime, errors='coerce', axis=1)
# Days from confirmation to death.
confinedDaysAnalysis['Total Treatment Days'] = (confinedDaysAnalysis['deceased_date'] - confinedDaysAnalysis['confirmed_date']).dt.days
display(confinedDaysAnalysis.head())

# .reset_index() replaces the redundant .unstack().stack() round-trip.
groupedData = confinedDaysAnalysis.groupby(['sex', 'age'])['Total Treatment Days'].mean().reset_index()
groupedData.columns = ["sex", "age", "Average Treatment Days"]

# Keyword arguments: positional pivot(index, columns, values) was removed in
# pandas 2.0.
dataForHeatmap = groupedData.pivot(index="sex", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(15, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Deceased Treatment Day, Sort by Age & Gender')
patient_id sex age country province city infection_case infected_by contact_number symptom_onset_date confirmed_date released_date deceased_date state Total Treatment Days
1468 1200000038 female 50s Korea Daegu Nam-gu NaN NaN NaN NaN 2020-02-18 NaN 2020-02-23 deceased 5
1507 1200000114 male 70s Korea Daegu NaN Shincheonji Church NaN NaN NaN 2020-02-21 NaN 2020-02-26 deceased 5
1508 1200000620 male 70s Korea Daegu NaN NaN NaN NaN NaN 2020-02-24 NaN 2020-03-02 deceased 7
1509 1200000901 female 80s Korea Daegu NaN NaN NaN NaN NaN 2020-02-25 NaN 2020-03-04 deceased 8
1510 1200001064 female 70s Korea Daegu NaN NaN NaN NaN NaN 2020-02-26 NaN 2020-03-01 deceased 4
age 30s 50s 60s 70s 80s 90s
sex
female NaN 1.333333 8.0 27.0 11.615385 10.0
male 0.0 8.000000 10.9 5.5 10.666667 10.5
Out[14]:
Text(0.5, 1, 'Average Deceased Treatment Day, Sort by Age & Gender')

Average Deceased Treatment Day (Fatality Speed) By City and Age

In [15]:
# .reset_index() replaces the redundant .unstack().stack() round-trip.
groupedData = confinedDaysAnalysis.groupby(['province', 'age'])['Total Treatment Days'].mean().reset_index()
groupedData.columns = ["province", "age", "Average Treatment Days"]
# Round up to whole days for a cleaner heatmap scale.
groupedData['Average Treatment Days'] = np.ceil(groupedData['Average Treatment Days'].apply(pd.to_numeric, errors='coerce'))

# Keyword arguments: positional pivot() was removed in pandas 2.0.
dataForHeatmap = groupedData.pivot(index="province", columns="age", values="Average Treatment Days")
display(dataForHeatmap.head())
f, ax = plt.subplots(figsize=(20, 5))
sns.heatmap(dataForHeatmap, cmap="RdPu", annot=False, fmt="d", linewidths=.5, ax=ax).set_title('Average Deceased Treatment Day, Sort by City & Age')
age 30s 50s 60s 70s 80s 90s
province
Daegu NaN 3.0 6.0 4.0 5.0 6.0
Daejeon NaN NaN NaN 52.0 NaN NaN
Gangwon-do NaN NaN NaN 15.0 24.0 NaN
Gyeonggi-do 0.0 NaN NaN NaN NaN NaN
Gyeongsangbuk-do NaN 7.0 11.0 17.0 13.0 12.0
Out[15]:
Text(0.5, 1, 'Average Deceased Treatment Day, Sort by City & Age')

Number of infections, survivors, and deceased per province, as percentages

In [16]:
survivorCountAnalysis = patientData.copy()

# Outcome flags: a patient counts as a survivor once a release date exists,
# deceased once a deceased date exists, otherwise still under treatment.
survivorCountAnalysis['survive'] = survivorCountAnalysis['released_date'].notnull()
survivorCountAnalysis['deceased'] = survivorCountAnalysis['deceased_date'].notnull()
survivorCountAnalysis['under treatment'] = survivorCountAnalysis['deceased_date'].isnull() & survivorCountAnalysis['released_date'].isnull()

provinceStats = survivorCountAnalysis.groupby(['province']).sum()
# .copy() so the percentage columns below are added to an independent frame
# rather than a slice of provinceStats (avoids SettingWithCopyWarning and
# silently lost writes).
provinceStatsClean = provinceStats[['survive', 'deceased', 'under treatment']].copy()

# Shared denominator, computed once instead of three times.
total = provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']
provinceStatsClean['survive %'] = np.round(provinceStatsClean['survive'] / total * 100, 2)
provinceStatsClean['deceased %'] = np.round(provinceStatsClean['deceased'] / total * 100, 2)
provinceStatsClean['under treatment %'] = np.round(provinceStatsClean['under treatment'] / total * 100, 2)

provinceStatsAbsolute = provinceStatsClean[['survive', 'deceased', 'under treatment']]
provinceStatsPercentage = provinceStatsClean[['survive %', 'deceased %', 'under treatment %']]

display(provinceStatsAbsolute.head())
display(provinceStatsPercentage.head())
survive deceased under treatment
province
Busan 0.0 0.0 151.0
Chungcheongbuk-do 50.0 0.0 6.0
Chungcheongnam-do 150.0 0.0 18.0
Daegu 4.0 20.0 113.0
Daejeon 44.0 1.0 74.0
survive % deceased % under treatment %
province
Busan 0.00 0.00 100.00
Chungcheongbuk-do 89.29 0.00 10.71
Chungcheongnam-do 89.29 0.00 10.71
Daegu 2.92 14.60 82.48
Daejeon 36.97 0.84 62.18
In [17]:
# Reshape to long form (province, statistic, rate) for a grouped bar chart.
newTable = provinceStatsPercentage.reset_index()
display(newTable.head())
newTable = newTable.melt(id_vars="province", var_name="stats", value_name="rate")

fig = px.bar(newTable, x='rate', y='province', color='stats', barmode='group')
fig.update_layout(hovermode='y')
fig.show()
province survive % deceased % under treatment %
0 Busan 0.00 0.00 100.00
1 Chungcheongbuk-do 89.29 0.00 10.71
2 Chungcheongnam-do 89.29 0.00 10.71
3 Daegu 2.92 14.60 82.48
4 Daejeon 36.97 0.84 62.18

Number of Infection, Survivor, Deceased Sort By Gender

In [18]:
survivorCountAnalysis = patientData.copy()

# Outcome flags, as in the per-province breakdown: survivor = has release
# date, deceased = has deceased date, otherwise still under treatment.
survivorCountAnalysis['survive'] = survivorCountAnalysis['released_date'].notnull()
survivorCountAnalysis['deceased'] = survivorCountAnalysis['deceased_date'].notnull()
survivorCountAnalysis['under treatment'] = survivorCountAnalysis['deceased_date'].isnull() & survivorCountAnalysis['released_date'].isnull()

provinceStats = survivorCountAnalysis.groupby(['sex']).sum()
# .copy() so the percentage columns below are added to an independent frame
# rather than a slice of provinceStats (avoids SettingWithCopyWarning and
# silently lost writes).
provinceStatsClean = provinceStats[['survive', 'deceased', 'under treatment']].copy()

# Shared denominator, computed once instead of three times.
total = provinceStatsClean['survive'] + provinceStatsClean['deceased'] + provinceStatsClean['under treatment']
provinceStatsClean['survive %'] = np.round(provinceStatsClean['survive'] / total * 100, 2)
provinceStatsClean['deceased %'] = np.round(provinceStatsClean['deceased'] / total * 100, 2)
provinceStatsClean['under treatment %'] = np.round(provinceStatsClean['under treatment'] / total * 100, 2)

provinceStatsAbsolute = provinceStatsClean[['survive', 'deceased', 'under treatment']]
provinceStatsPercentage = provinceStatsClean[['survive %', 'deceased %', 'under treatment %']]

display(provinceStatsAbsolute)
display(provinceStatsPercentage)
survive deceased under treatment
sex
female 909.0 26.0 1285.0
male 677.0 40.0 1108.0
survive % deceased % under treatment %
sex
female 40.95 1.17 57.88
male 37.10 2.19 60.71
In [19]:
# Same long-form reshaping as for provinces, now keyed by sex.
newTable = provinceStatsPercentage.reset_index()
newTable = newTable.melt(id_vars="sex", var_name="stats", value_name="rate")
display(newTable.head())

fig = px.bar(newTable, x='rate', y='sex', color='stats', barmode='group')
fig.update_layout(hovermode='y')
fig.show()
sex stats rate
0 female survive % 40.95
1 male survive % 37.10
2 female deceased % 1.17
3 male deceased % 2.19
4 female under treatment % 57.88

Network Diagram

In [20]:
# Restrict to patients whose infector is known, keeping only the columns the
# network graph needs.
keepCols = ['patient_id', 'sex', 'age', 'province', 'city', 'infection_case', 'infected_by', 'state']
networkData = patientData.copy()
networkData = networkData.loc[networkData['infected_by'].notna(), keepCols]
display(networkData.head())
patient_id sex age province city infection_case infected_by state
2 1000000003 male 50s Seoul Jongno-gu contact with patient 2002000001 released
4 1000000005 female 20s Seoul Seongbuk-gu contact with patient 1000000002 released
5 1000000006 female 50s Seoul Jongno-gu contact with patient 1000000003 released
6 1000000007 male 20s Seoul Jongno-gu contact with patient 1000000003 released
9 1000000010 female 60s Seoul Seongbuk-gu contact with patient 1000000003 released
In [21]:
# Build the infection network: nodes are patients, edges connect an infector
# ("infected_by") with an infectee ("patient_id").
A = list(networkData["infected_by"].unique())
B = list(networkData["patient_id"].unique())
node_list = set(A+B)

# Create Graph
G = nx.Graph()

for i in node_list:
    G.add_node(i)
# G.nodes()

for i,j in networkData.iterrows():
    G.add_edges_from([(j["infected_by"],j["patient_id"])])

# Force-directed layout; k controls the optimal distance between nodes.
pos = nx.spring_layout(G, k=0.5, iterations=50)

for n, p in pos.items():
    G.nodes[n]['pos'] = p

# All edges are drawn as one Scatter trace; None breaks the line between
# individual segments.
edge_trace = go.Scatter(
    x=[],
    y=[],
    line=dict(width=0.5,color='#888'),
    hoverinfo='none',
    mode='lines')

for edge in G.edges():
    x0, y0 = G.nodes[edge[0]]['pos']
    x1, y1 = G.nodes[edge[1]]['pos']
    edge_trace['x'] += tuple([x0, x1, None])
    edge_trace['y'] += tuple([y0, y1, None])

node_trace = go.Scatter(
    x=[],
    y=[],
    text=[],
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='RdPu_r',
        reversescale=True,
        color=[],
        size=15,
        colorbar=dict(
            thickness=10,
            # Fixed typo: was 'Numnber of Infected Cases'.
            title='Number of Infected Cases',
            xanchor='left',
            titleside='right'
        ),
        line=dict(width=0)))

for node in G.nodes():
    x, y = G.nodes[node]['pos']
    node_trace['x'] += tuple([x])
    node_trace['y'] += tuple([y])

# Color and label each node by its degree (number of direct connections).
for node, adjacencies in enumerate(G.adjacency()):
    node_trace['marker']['color']+=tuple([len(adjacencies[1])])
    node_info = str(adjacencies[0]) +' # of connections: '+ str(len(adjacencies[1]))
    node_trace['text']+=tuple([node_info])

fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='<br>Korea Covid Network Connections',
                titlefont=dict(size=16),
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                annotations=[ dict(
                    text="",
                    showarrow=False,
                    xref="paper", yref="paper") ],
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

iplot(fig)

Policy

In [22]:
policyData = pd.read_csv('covid/policy.csv')
policyDataCopy = policyData.copy()
# Open-ended policies get today's date so their timeline bar extends to "now".
policyDataCopy['end_date'] = policyDataCopy['end_date'].fillna(datetime.now().strftime('%Y-%m-%d'))
display(policyDataCopy.head())

# One Gantt row per policy.
taskRows = [
    dict(Task=row['gov_policy'], Start=row['start_date'],
         Finish=row['end_date'], Resource=row['type'])
    for _, row in policyDataCopy.iterrows()
]

fig = px.timeline(taskRows, x_start="Start", x_end="Finish", y="Task", color="Resource", title="Policy over time")
fig.show()

# Re-plot the daily case curve for side-by-side comparison with the policies.
patientData = pd.read_csv('covid/patientinfo.csv')
groupPatientData = patientData.groupby('confirmed_date').size().reset_index(name='count')

fig = px.line(groupPatientData, x="confirmed_date", y="count", title='Numbers Of Covid Cases Per Day')
fig.show()
policy_id country type gov_policy detail start_date end_date
0 1 Korea Alert Infectious Disease Alert Level Level 1 (Blue) 2020-01-03 2020-01-19
1 2 Korea Alert Infectious Disease Alert Level Level 2 (Yellow) 2020-01-20 2020-01-27
2 3 Korea Alert Infectious Disease Alert Level Level 3 (Orange) 2020-01-28 2020-02-22
3 4 Korea Alert Infectious Disease Alert Level Level 4 (Red) 2020-02-23 2020-09-15
4 5 Korea Immigration Special Immigration Procedure from China 2020-02-04 2020-09-15

Time

In [23]:
timeData = pd.read_csv('covid/time.csv')
# Long format: one row per (date, metric) pair for a multi-line plot.
timeDataMelted = timeData.melt(id_vars=['date'], value_vars=['test', 'negative', 'confirmed', 'released', 'deceased'])
display(timeDataMelted.head())

fig = px.line(timeDataMelted, x="date", y="value", color='variable', title="Overall cases over time")
fig.show()
date variable value
0 2020-01-20 test 1
1 2020-01-21 test 1
2 2020-01-22 test 4
3 2020-01-23 test 22
4 2020-01-24 test 27
In [24]:
# Drop the dominant 'test'/'negative' series to zoom in on outcomes.
timeDataMelted = timeData.melt(id_vars=['date'], value_vars=['confirmed', 'released', 'deceased'])
display(timeDataMelted.head())

fig = px.line(timeDataMelted, x="date", y="value", color='variable', title="Confirmed, released & deceased over time")
fig.show()
date variable value
0 2020-01-20 confirmed 1
1 2020-01-21 confirmed 1
2 2020-01-22 confirmed 1
3 2020-01-23 confirmed 1
4 2020-01-24 confirmed 2
In [25]:
# .copy() so the derived columns are added to an independent frame, not a
# slice of timeData (avoids SettingWithCopyWarning / silently lost writes).
miniTimeData = timeData[['date', 'test', 'negative']].copy()
# Daily deltas of the cumulative counters; .diff() == x - x.shift(1).
miniTimeData['Test increase'] = miniTimeData['test'].diff()
miniTimeData['Negative increase'] = miniTimeData['negative'].diff()
miniTimeData['Month'] = pd.to_datetime(miniTimeData['date']).dt.month
miniTimeData['Day'] = pd.to_datetime(miniTimeData['date']).dt.day_name()
display(miniTimeData.head())
date test negative Test increase Negative increase Month Day
0 2020-01-20 1 0 NaN NaN 1 Monday
1 2020-01-21 1 0 0.0 0.0 1 Tuesday
2 2020-01-22 4 3 3.0 3.0 1 Wednesday
3 2020-01-23 22 21 18.0 18.0 1 Thursday
4 2020-01-24 27 25 5.0 4.0 1 Friday
In [26]:
miniTimeDataDay = miniTimeData[['Test increase', 'Negative increase', 'Day']]
# Select columns with a list: groupby(...)['a', 'b'] tuple indexing is
# deprecated and removed in modern pandas.
miniTimeDataDay = miniTimeDataDay.groupby('Day')[['Test increase', 'Negative increase']].mean()
# Reindex to calendar order (groupby sorts alphabetically).
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
miniTimeDataDay = miniTimeDataDay.reindex(cats)
miniTimeDataDay = miniTimeDataDay.reset_index()
miniTimeDataDayMelted = pd.melt(miniTimeDataDay, id_vars=['Day'], value_vars=['Test increase', 'Negative increase'])
display(miniTimeDataDayMelted.head())

fig = px.bar(miniTimeDataDayMelted, x='Day', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
Day variable value
0 Monday Test increase 4852.565217
1 Tuesday Test increase 9753.958333
2 Wednesday Test increase 8594.391304
3 Thursday Test increase 8632.000000
4 Friday Test increase 9373.869565
In [27]:
miniTimeDataMonth = miniTimeData[['Test increase', 'Negative increase', 'Month']]
# Select columns with a list: groupby(...)['a', 'b'] tuple indexing is
# deprecated and removed in modern pandas.
miniTimeDataMonth = miniTimeDataMonth.groupby('Month')[['Test increase', 'Negative increase']].mean()
miniTimeDataMonth = miniTimeDataMonth.reset_index()
miniTimeDataMonthMelted = pd.melt(miniTimeDataMonth, id_vars=['Month'], value_vars=['Test increase', 'Negative increase'])
display(miniTimeDataMonthMelted.head())

fig = px.bar(miniTimeDataMonthMelted, x='Month', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
Month variable value
0 1 Test increase 28.272727
1 2 Test increase 3232.517241
2 3 Test increase 10209.967742
3 4 Test increase 6977.233333
4 5 Test increase 9385.193548
In [28]:
# .copy() so the derived columns are added to an independent frame, not a
# slice of timeData (avoids SettingWithCopyWarning / silently lost writes).
miniTimeData = timeData[['date', 'confirmed', 'released', 'deceased']].copy()
# Daily deltas of the cumulative counters; .diff() == x - x.shift(1).
miniTimeData['confirmed increase'] = miniTimeData['confirmed'].diff()
miniTimeData['released increase'] = miniTimeData['released'].diff()
miniTimeData['deceased increase'] = miniTimeData['deceased'].diff()
miniTimeData['Month'] = pd.to_datetime(miniTimeData['date']).dt.month
miniTimeData['Day'] = pd.to_datetime(miniTimeData['date']).dt.day_name()
display(miniTimeData.head())
date confirmed released deceased confirmed increase released increase deceased increase Month Day
0 2020-01-20 1 0 0 NaN NaN NaN 1 Monday
1 2020-01-21 1 0 0 0.0 0.0 0.0 1 Tuesday
2 2020-01-22 1 0 0 0.0 0.0 0.0 1 Wednesday
3 2020-01-23 1 0 0 0.0 0.0 0.0 1 Thursday
4 2020-01-24 2 0 0 1.0 0.0 0.0 1 Friday
In [29]:
miniTimeDataDay = miniTimeData[['confirmed increase', 'released increase', 'deceased increase', 'Day']]
# Select columns with a list: groupby(...)['a', 'b'] tuple indexing is
# deprecated and removed in modern pandas.
miniTimeDataDay = miniTimeDataDay.groupby('Day')[['confirmed increase', 'released increase', 'deceased increase']].mean()
# Reindex to calendar order (groupby sorts alphabetically).
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
miniTimeDataDay = miniTimeDataDay.reindex(cats)
miniTimeDataDay = miniTimeDataDay.reset_index()
miniTimeDataDayMelted = pd.melt(miniTimeDataDay, id_vars=['Day'], value_vars=['confirmed increase', 'released increase', 'deceased increase'])
display(miniTimeDataDayMelted.head())

fig = px.bar(miniTimeDataDayMelted, x='Day', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
Day variable value
0 Monday confirmed increase 67.217391
1 Tuesday confirmed increase 63.375000
2 Wednesday confirmed increase 76.130435
3 Thursday confirmed increase 79.565217
4 Friday confirmed increase 84.086957
In [30]:
miniTimeDataMonth = miniTimeData[['confirmed increase', 'released increase', 'deceased increase', 'Month']]
# Select columns with a list: groupby(...)['a', 'b'] tuple indexing is
# deprecated and removed in modern pandas.
miniTimeDataMonth = miniTimeDataMonth.groupby('Month')[['confirmed increase', 'released increase', 'deceased increase']].mean()
miniTimeDataMonth = miniTimeDataMonth.reset_index()
miniTimeDataMonthMelted = pd.melt(miniTimeDataMonth, id_vars=['Month'], value_vars=['confirmed increase', 'released increase', 'deceased increase'])
display(miniTimeDataMonthMelted.head())

fig = px.bar(miniTimeDataMonthMelted, x='Month', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
Month variable value
0 1 confirmed increase 0.909091
1 2 confirmed increase 108.241379
2 3 confirmed increase 214.064516
3 4 confirmed increase 32.633333
4 5 confirmed increase 22.677419

TimeAge

In [31]:
# Confirmed cases per age bracket over time.
timeAgeData = pd.read_csv('covid/timeage.csv')
display(timeAgeData.head())

fig = px.line(timeAgeData, x="date", y="confirmed", color='age',
              title="Confirmed cases of various age group over time")
fig.show()
date time age confirmed deceased
0 2020-03-02 0 0s 32 0
1 2020-03-02 0 10s 169 0
2 2020-03-02 0 20s 1235 0
3 2020-03-02 0 30s 506 1
4 2020-03-02 0 40s 633 1
In [32]:
# Same per-age data, plotting deceased counts instead of confirmed.
timeAgeData = pd.read_csv('covid/timeage.csv')
fig = px.line(timeAgeData, x="date", y="deceased", color='age',
              title="Deceased cases of various age group over time")
fig.show()

Time Gender

In [33]:
# Confirmed cases by sex over time.
timeGender = pd.read_csv('covid/timegender.csv')
display(timeGender.head())

fig = px.line(timeGender, x="date", y="confirmed", color='sex',
              title="Confirmed cases between genders over time")
fig.show()
date time sex confirmed deceased
0 2020-03-02 0 male 1591 13
1 2020-03-02 0 female 2621 9
2 2020-03-03 0 male 1810 16
3 2020-03-03 0 female 3002 12
4 2020-03-04 0 male 1996 20
In [34]:
# Companion plot to the confirmed-cases chart above: deceased counts by sex.
fig = px.line(timeGender, x="date", y="deceased", color='sex', title="Deceased cases between genders over time")
fig.show()
In [35]:
timeGender = pd.read_csv('covid/timegender.csv')
timeGenderAnalysis = timeGender.copy()

# The file holds two rows per date (one per sex — see the head() above), so
# shift(2) compares each row against the SAME sex's cumulative count on the
# previous day, turning cumulative totals into daily increases.
timeGenderAnalysis['confirmed increase'] = timeGenderAnalysis['confirmed'] - timeGenderAnalysis['confirmed'].shift(2)
timeGenderAnalysis['deceased increase'] = timeGenderAnalysis['deceased'] - timeGenderAnalysis['deceased'].shift(2)

timeGenderAnalysis['month'] = pd.to_datetime(timeGenderAnalysis['date']).dt.month
timeGenderAnalysis['day'] = pd.to_datetime(timeGenderAnalysis['date']).dt.day_name()

# Mean daily confirmed increase per (month, sex).
timeGenderAnalysisMonth = timeGenderAnalysis.groupby(['month','sex'])['confirmed increase'].mean()
timeGenderAnalysisMonth = timeGenderAnalysisMonth.reset_index()

timeGenderAnalysisMonthMelted = pd.melt(timeGenderAnalysisMonth, id_vars=['month','sex'], value_vars=['confirmed increase'])
display(timeGenderAnalysisMonthMelted.head())

fig = px.bar(timeGenderAnalysisMonthMelted, x='month', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
month sex variable value
0 3 female confirmed increase 112.413793
1 3 male confirmed increase 79.793103
2 4 female confirmed increase 17.733333
3 4 male confirmed increase 14.900000
4 5 female confirmed increase 8.387097
In [36]:
# Mean daily confirmed increase per weekday, split by sex.
timeGenderAnalysisMonth = timeGenderAnalysis.groupby(['day', 'sex'])['confirmed increase'].mean().reset_index()

# Order weekdays chronologically instead of alphabetically.
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
timeGenderAnalysisMonth['day'] = pd.Categorical(timeGenderAnalysisMonth['day'], categories=weekdays)
timeGenderAnalysisMonth = timeGenderAnalysisMonth.sort_values('day')

timeGenderAnalysisMonthMelted = timeGenderAnalysisMonth.melt(id_vars=['day', 'sex'], value_vars=['confirmed increase'])
display(timeGenderAnalysisMonthMelted.head())

fig = px.bar(timeGenderAnalysisMonthMelted, x='day', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
day sex variable value
0 Monday female confirmed increase 25.352941
1 Monday male confirmed increase 21.764706
2 Tuesday female confirmed increase 41.500000
3 Tuesday male confirmed increase 34.833333
4 Wednesday female confirmed increase 48.941176
In [37]:
# Average daily increase of deceased cases per calendar month and sex.
timeGenderAnalysisMonth = (
    timeGenderAnalysis
    .groupby(['month', 'sex'])['deceased increase']
    .mean()
    .reset_index()
)

timeGenderAnalysisMonthMelted = pd.melt(
    timeGenderAnalysisMonth, id_vars=['month', 'sex'], value_vars=['deceased increase'])
display(timeGenderAnalysisMonthMelted.head())

fig = px.bar(timeGenderAnalysisMonthMelted, x='month', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
month sex variable value
0 3 female deceased increase 2.448276
1 3 male deceased increase 2.379310
2 4 female deceased increase 1.233333
3 4 male deceased increase 1.600000
4 5 female deceased increase 0.322581
In [38]:
# Average daily increase of deceased cases by weekday and sex.
timeGenderAnalysisMonth = (
    timeGenderAnalysis
    .groupby(['day', 'sex'])['deceased increase']
    .mean()
    .reset_index()
)

# Order weekdays Monday..Sunday instead of alphabetically.
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
timeGenderAnalysisMonth['day'] = pd.Categorical(timeGenderAnalysisMonth['day'], categories=weekdays)
timeGenderAnalysisMonth = timeGenderAnalysisMonth.sort_values('day')

timeGenderAnalysisMonthMelted = pd.melt(
    timeGenderAnalysisMonth, id_vars=['day', 'sex'], value_vars=['deceased increase'])
display(timeGenderAnalysisMonthMelted.head())

fig = px.bar(timeGenderAnalysisMonthMelted, x='day', y='value', color='sex', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
day sex variable value
0 Monday female deceased increase 0.882353
1 Monday male deceased increase 0.823529
2 Tuesday female deceased increase 1.666667
3 Tuesday male deceased increase 1.166667
4 Wednesday female deceased increase 0.882353

Time Province

In [39]:
# Province-level time series: cumulative confirmed cases, one line per province.
timeProvince = pd.read_csv('covid/timeprovince.csv')
display(timeProvince.head())

fig = px.line(
    timeProvince,
    x="date",
    y="confirmed",
    color='province',
    title="Confirmed cases of various province over time",
)
fig.show()
date time province confirmed released deceased
0 2020-01-20 16 Seoul 0 0 0
1 2020-01-20 16 Busan 0 0 0
2 2020-01-20 16 Daegu 0 0 0
3 2020-01-20 16 Incheon 1 0 0
4 2020-01-20 16 Gwangju 0 0 0
In [40]:
# Cumulative released (recovered) cases, one line per province.
fig = px.line(
    timeProvince,
    x="date",
    y="released",
    color='province',
    title="Released cases of various province over time",
)
fig.show()
In [41]:
# Cumulative deceased cases, one line per province.
fig = px.line(
    timeProvince,
    x="date",
    y="deceased",
    color='province',
    title="Deceased cases of various province over time",
)
fig.show()

In depth Analysis Of Daegu Cases using Technical Analysis.

In [42]:
# Isolate Daegu and turn the cumulative confirmed count into a daily increase.
dataForDaegu = timeProvince[timeProvince['province'] == 'Daegu']
dataForDaegu = dataForDaegu[['date','province','confirmed']]
# Series.diff() is the idiomatic equivalent of `x - x.shift(1)`; the first row
# has no predecessor and becomes NaN (visible in the displayed head()).
dataForDaegu['increasePerDay'] = dataForDaegu['confirmed'].diff()
dataForDaegu = dataForDaegu[['date','province','increasePerDay']]
dataForDaegu = dataForDaegu.rename({'increasePerDay':'Increase Per Day'}, axis=1)
display(dataForDaegu.head())
fig = px.line(dataForDaegu, x="date", y="Increase Per Day", title="Daegu Cases Per Day")
fig.show()
date province Increase Per Day
2 2020-01-20 Daegu NaN
19 2020-01-21 Daegu 0.0
36 2020-01-22 Daegu 0.0
53 2020-01-23 Daegu 0.0
70 2020-01-24 Daegu 0.0

Daegu's cases are most active during the Feb–Apr period.

Technical Analysis

In [43]:
# Uncomment to focus the data on the active Feb-Apr period
# dataForDaegu = dataForDaegu.set_index('date')
# dataForDaegu = dataForDaegu['2020-02-01' : '2020-04-30'].reset_index()

# Build the rolling window once and reuse it: the original recomputed
# rolling(10).mean() three times and rolling(10).std() twice.
rolling10 = dataForDaegu['Increase Per Day'].rolling(10)
movingAverage = rolling10.mean()
bandHalfWidth = rolling10.std() * 2  # Bollinger bands: mean +/- 2 std devs
dataForDaegu['10 Days Moving Average'] = movingAverage
dataForDaegu['Upper Limit 10 day Bollinger Band'] = movingAverage + bandHalfWidth
dataForDaegu['Lower Limit 10 day Bollinger Band'] = movingAverage - bandHalfWidth
dataForDaeguNew = pd.melt(dataForDaegu, id_vars=['date'], value_vars=['Increase Per Day', '10 Days Moving Average','Upper Limit 10 day Bollinger Band','Lower Limit 10 day Bollinger Band'])
display(dataForDaeguNew.head())
fig = px.line(dataForDaeguNew, x="date", y="value", color='variable', title="Daegu Cases Per Day")
# Soften the band traces so the raw daily series stands out.
fig['data'][2]['line']['color']="rgba(147,212,219,0.8)"
fig['data'][3]['line']['color']="rgba(157,212,219,0.8)"
fig.update_layout(hovermode='x')
fig.show()
date variable value
0 2020-01-20 Increase Per Day NaN
1 2020-01-21 Increase Per Day 0.0
2 2020-01-22 Increase Per Day 0.0
3 2020-01-23 Increase Per Day 0.0
4 2020-01-24 Increase Per Day 0.0

Stats By Month

In [44]:
# Derive month and weekday labels, then average each melted series per month.
dataForDaeguNew['date'] = pd.to_datetime(dataForDaeguNew['date'])
dataForDaeguNew['Month'] = dataForDaeguNew['date'].dt.month
dataForDaeguNew['Day'] = dataForDaeguNew['date'].dt.day_name()
# (Removed a dead assignment: the original built a ['variable','Month']
# projection here and immediately overwrote it on the next line.)
dataForDaeguMonth = dataForDaeguNew.groupby(['Month','variable'])['value'].mean()
dataForDaeguMonth = dataForDaeguMonth.reset_index()
display(dataForDaeguMonth.head())

fig = px.bar(dataForDaeguMonth, x='Month', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
Month variable value
0 1 10 Days Moving Average 0.000000
1 1 Increase Per Day 0.000000
2 1 Lower Limit 10 day Bollinger Band 0.000000
3 1 Upper Limit 10 day Bollinger Band 0.000000
4 2 10 Days Moving Average 25.265517

Stats By Day

In [45]:
# Average each melted series per weekday, ordered Monday..Sunday.
# (Removed a dead assignment: the original built a ['variable','Day']
# projection here and immediately overwrote it.)
dataForDaeguMonth = dataForDaeguNew.groupby(['Day','variable'])['value'].mean()
dataForDaeguMonth = dataForDaeguMonth.reset_index()

weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
dataForDaeguMonth['Day'] = pd.Categorical(dataForDaeguMonth['Day'],categories=weekdays)
dataForDaeguMonth = dataForDaeguMonth.sort_values('Day')

display(dataForDaeguMonth.head())

fig = px.bar(dataForDaeguMonth, x='Day', y='value', color='variable', barmode='group')
fig.update_layout(hovermode='x')
fig.show()
Day variable value
4 Monday 10 Days Moving Average 46.068182
5 Monday Increase Per Day 36.434783
6 Monday Lower Limit 10 day Bollinger Band -8.111102
7 Monday Upper Limit 10 day Bollinger Band 100.247465
23 Tuesday Upper Limit 10 day Bollinger Band 92.607688

Time Series World Map Visualisation of COVID-19 cases in Korea

In [46]:
# Attach hand-collected coordinates to the province time series so the cases
# can be drawn on a map.
# NOTE(review): coordinates appear to be one point per province (capital or
# centroid?) — spot-check a few against an authoritative source.
timeProvince = pd.read_csv('covid/timeprovince.csv')

data = {'province': ['Seoul', 'Busan', 'Daegu', 'Incheon', 'Gwangju', 'Daejeon',
       'Ulsan', 'Sejong', 'Gyeonggi-do', 'Gangwon-do',
       'Chungcheongbuk-do', 'Chungcheongnam-do', 'Jeollabuk-do',
       'Jeollanam-do', 'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Jeju-do'], 
        'longitude': [127.047325,129.066666,128.600006,126.705208,126.916664,127.385002,
                     129.316666,127.2822,127.143738,127.920158,
                     127.935905,126.979874,126.916664,126.9910,
                      129.263885,128.429581,126.5312],
        'latitude': [37.517235,35.166668,35.866669,37.456257,35.166668,36.351002,
                    35.549999,36.4870,37.603405,37.342220,
                    36.981304,36.806702,35.166668,34.8679,
                     35.835354,34.855228,33.4996]}
location = pd.DataFrame(data=data)

# Left-merge keeps every time-series row; provinces missing from `location`
# get NaN coordinates and are then removed by dropna().
mergedData = timeProvince.merge(location, on='province', how='left')
mergedData = mergedData.dropna()
mergedData.head()
Out[46]:
date time province confirmed released deceased longitude latitude
0 2020-01-20 16 Seoul 0 0 0 127.047325 37.517235
1 2020-01-20 16 Busan 0 0 0 129.066666 35.166668
2 2020-01-20 16 Daegu 0 0 0 128.600006 35.866669
3 2020-01-20 16 Incheon 1 0 0 126.705208 37.456257
4 2020-01-20 16 Gwangju 0 0 0 126.916664 35.166668

With Animation

In [47]:
# Animated map: one frame per date; bubble size = confirmed, colour = deceased.
fig = px.scatter_mapbox(
    mergedData, lat="latitude", lon="longitude",
    size="confirmed", size_max=100,
    color="deceased", color_continuous_scale=px.colors.sequential.Burg,
    hover_name="province",           
    mapbox_style='dark', zoom=6,
    animation_frame="date", animation_group="province"
)

# Playback speed: 200 ms per frame and per transition.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 200
fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = 200
# Hide the colour bar; nudge the slider and play button away from the map edge.
fig.layout.coloraxis.showscale = False
fig.layout.sliders[0].pad.t = 10
fig.layout.updatemenus[0].pad.t= 10

fig.show()

Without animation

In [48]:
# Static version of the map above — all dates overlaid, no animation controls.
fig = px.scatter_mapbox(
    mergedData,
    lat="latitude",
    lon="longitude",
    size="confirmed",
    size_max=100,
    color="deceased",
    color_continuous_scale=px.colors.sequential.Burg,
    hover_name="province",
    mapbox_style='dark',
    zoom=6,
)
fig.show()

Weather

In [49]:
# Daily average temperature per province (Jan-Aug 2020) as a heatmap.
weatherData = pd.read_csv('covid/weather.csv')
weatherData = weatherData.set_index('date')
weatherData = weatherData['2020-01-01' : '2020-08-31'].reset_index()
display(weatherData.head())

# Keyword arguments are required here: positional DataFrame.pivot arguments
# were removed in pandas 2.0. (Also dropped fmt="d", which is only used when
# annot=True.)
weatherDataForHeatmap = weatherData.pivot(index="date", columns="province", values="avg_temp")
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(weatherDataForHeatmap, cmap="RdPu", annot=False, linewidths=.5, ax=ax).set_title('Weather Over Time')
date code province avg_temp min_temp max_temp precipitation max_wind_speed most_wind_direction avg_relative_humidity
0 2020-01-01 10000 Seoul -2.2 -6.5 0.3 0.0 2.6 50.0 64.4
1 2020-01-01 11000 Busan 1.9 -3.2 7.8 0.0 5.1 340.0 44.0
2 2020-01-01 12000 Daegu 0.2 -4.9 4.6 0.0 5.6 270.0 53.3
3 2020-01-01 13000 Gwangju -0.3 -4.9 5.7 0.0 4.3 50.0 58.0
4 2020-01-01 14000 Incheon -1.4 -5.4 1.9 0.0 3.8 160.0 66.6
Out[49]:
Text(0.5, 1, 'Weather Over Time')

Region

In [50]:
# Region-level metadata: one row per city with school / elderly statistics.
regionData = pd.read_csv('covid/region.csv')
pd.set_option('display.max_rows', regionData.shape[0]+1)
display(regionData.head())

# Drop the country-level aggregate row ('Korea') so it does not dwarf the
# individual city markers. (Title typos fixed: "nusing"/"elder".)
fig = px.scatter_mapbox(
    regionData[regionData.city != 'Korea'], 
    text="city",
    lat="latitude", 
    lon="longitude",     
    color="elderly_population_ratio", 
    size="nursing_home_count",
    color_continuous_scale=px.colors.sequential.Burg, 
    size_max=100, 
    zoom=6,
    title="Number of nursing homes and elderly population ratio across Korea")
fig.show()
code province city latitude longitude elementary_school_count kindergarten_count university_count academy_ratio elderly_population_ratio elderly_alone_ratio nursing_home_count
0 10000 Seoul Seoul 37.566953 126.977977 607 830 48 1.44 15.38 5.8 22739
1 10010 Seoul Gangnam-gu 37.518421 127.047222 33 38 0 4.18 13.17 4.3 3088
2 10020 Seoul Gangdong-gu 37.530492 127.123837 27 32 0 1.54 14.55 5.4 1023
3 10030 Seoul Gangbuk-gu 37.639938 127.025508 14 21 0 0.67 19.49 8.5 628
4 10040 Seoul Gangseo-gu 37.551166 126.849506 36 56 1 1.17 14.39 5.7 1080

Web Scraping: Comparison across the world

In [51]:
import requests

# Fetch the live worldometers table. A timeout prevents the cell from hanging
# indefinitely if the site is unreachable; 200 confirms a successful response.
page = requests.get("https://www.worldometers.info/coronavirus", timeout=30)
page.status_code
Out[51]:
200
In [52]:
# page.content
# Parse the fetched HTML with the lxml parser.
soup = BeautifulSoup(page.content, 'lxml')
In [53]:
# print(soup.prettify())
In [54]:
# Worldometers renders today's per-country statistics in this table element.
covidTable = soup.find('table', attrs={'id': 'main_table_countries_today'})
# covidTable

Getting the data from the table

In [55]:
rows = covidTable.find_all("tr", attrs={"style": ""})
In [56]:
# Flatten each table row into a list of cell strings. Row 0 is the header row
# and keeps one extra field; only the first 12 are used as column names later.
covidData = []
for rowIndex, row in enumerate(rows):
    fieldLimit = 13 if rowIndex == 0 else 12
    covidData.append(row.text.strip().split("\n")[:fieldLimit])
# covidData

Convert to a DataFrame

In [57]:
# First scraped row supplies the column names; the rest are data rows.
covidTable = pd.DataFrame(covidData[1:], columns=covidData[0][:12])
# Drop the world/total aggregate rows, then the rank column itself.
aggregateRows = covidTable['#'].isin(['World', 'Total:'])
covidTable = covidTable.loc[~aggregateRows].drop(columns='#')
covidTable.head()
Out[57]:
Country,Other TotalCases NewCases TotalDeaths NewDeaths TotalRecovered NewRecovered ActiveCases Serious,Critical Tot Cases/1M pop Deaths/1M pop
1 USA 6,719,607 +11,149 198,654 +134 3,982,123 +7,174 2,538,830 14,113 20,276 599
2 India 4,878,042 +33,039 80,026 +272 3,809,549 +32,505 988,467 8,944 3,528 58
3 Brazil 4,335,066 +4,611 131,736 +73 3,573,958 629,372 8,318 20,365 619
4 Russia 1,068,320 +5,509 18,635 +57 878,700 +2,475 170,985 2,300 7,320 128
5 Peru 729,619 30,710 566,796 132,113 1,448 22,067 929

Worldwide Total Cases Chart

In [58]:
# Raw total-case counts per country — heavily dominated by a few countries,
# which motivates the log transform in the next cell.
fig = px.bar(covidTable, y='TotalCases', x='Country,Other')
fig.show()

We will be using log because the data is highly skewed

In [59]:
# Strip thousands separators ("6,719,607" -> 6719607) and log-scale the counts
# because the distribution is heavily right-skewed.
# regex=True is required for the \D pattern: Series.str.replace stopped
# treating patterns as regex by default in pandas 2.0.
covidTable['logTotalCases'] = np.log(covidTable['TotalCases'].str.replace(r'\D', '', regex=True).astype(int))
covidTable = covidTable.sort_values(by=['logTotalCases'])
fig = px.bar(covidTable, y='logTotalCases', x='Country,Other')
fig.show()

Worldwide Total Death Chart

In [60]:
pd.set_option('display.max_rows', covidTable.shape[0]+1)
# Normalise the scraped death counts. The original called .str.strip() without
# assigning the result (a no-op) and filled blanks with integer 0, which made
# the later .str accessor yield NaN for those rows anyway. Keep the column as
# strings: blanks become NaN and countries with no reported deaths are simply
# omitted from the log chart.
covidTable['TotalDeaths'] = covidTable['TotalDeaths'].str.strip()
covidTable['TotalDeaths'].replace('', np.nan, inplace=True)

# regex=True required since pandas 2.0 for the \D pattern.
covidTable['logTotalDeath'] = np.log(covidTable['TotalDeaths'].str.replace(r'\D', '', regex=True).astype(float))
covidTable = covidTable.sort_values(by=['logTotalDeath'])
fig = px.bar(covidTable, y='logTotalDeath', x='Country,Other')
fig.show()
In [61]:
# OLS trendline of log deaths vs log cases. (Title typo "againist" fixed.)
fig = px.scatter(covidTable, x="logTotalDeath", y="logTotalCases", trendline="ols", title="Total Cases against Total Death (Log)")
fig.show()

Plotting Data

Summarise Key findings from Exploratory Data Analysis

Overall Cases in Korea

  • On average, there are 100 COVID-19 cases in infected city.
  • The numbers are highly skewed by Nam-gu, a province of Daegu. Nam-gu has about 4500 cases.
  • Most of the clusters are in the province of Daegu, Gyeonggi-do and Seoul

Cases across various Korea province

  • Majority of the province has more individual cases of COVID-19 than group cases with the exception of Incheon and Ulsan
  • Seoul, Daegu, Gyeonggi-do and Gyeongsangbuk-do has the most number of individual cases among the Korea provinces, number of cases range from 150 to 200 with the exception of Daegu
  • In addition, the same group of provinces has the most number of grouped cases, number of cases range from 50-100 with the exception of Daegu

Source of infection

  • Sources of infection mainly come from Shincheonji Church, Itawon Clubs, Contact with Patients and Overseas inflow
  • Most of the infection sources across various province has less than 500 cases with the exception of Daegu.
  • Daegu has more than 4000 cases from Shincheonji Church and about 2000 cases from contact with patient and other cases each.

Patient Cases

  • There is a quick spike in daily COVID-19 cases from Feb to Mar before slowing dripping down to single digit in May. There are double digit COVID-19 cases in June

Recovery Speed

  • Age has an R-squared value of 0.04 for male and 0.03 for female against total treatment days.
  • At first glance, age seems to have little to no effect on recovery speed.
  • However when we used the mean recovery days for each age group, it was found that younger people recovered fast from COVID-19 as compared to older people.
  • Average days to recover for young people (<=30) is around 25 days and below. Middle-aged people(30-60) took about 25-30 days to recover. Older people (>=70) took about 32-36 days to recover
  • Interestingly, infected people in Daegu only consist of people in their 50s to 70s. In addition, it took less than 15 days to recover. People in Jeollabuk-do take longer to recover from COVID-19 as compared to the rest of the provinces. The majority of the people took about 20-25 days to recover.

Fatality Speed

  • The average number of treatment days a patient had before he/she passed away is shorter than the average number of days it took for a patient to recover.
  • In addition, younger patient has a lower number of treatment days as compared to older patients. This might indicate that younger patient seek treatment at the later stage of COVID-19.
  • Coincidentally, the number of treatment days increased as the age increases. The trend is similar to the recovery speed among during age group.
  • Daegu has the fastest fatality speed. An average patient is under treatment for less than 10 days before he/she passed away
  • Gyeonggi-do has an outlier: a patient in their 30s died less than 10 days after seeking treatment.

Percentage of patients recovered, under treatment and deceased

  • Daegu has the highest fatality among all the provinces, with a fatality rate of more than 10%. In addition, Daejeon, Gyeongsangbuk-do and Ulsan are the only other provinces which have fatalities.
  • Male has a higher fatality rate and a lowest survival rate as compared to female

COVID-19 Transmission Amount

  • Majority of the cases only spread to 1 other person.
  • Case #20000000205 is a super spreader who spread the virus to more than 50 people.
  • In addition, there are less than 10 cases where patient spread to more than 30 people or more.

Policy

  • There is a quick spike in daily COVID-19 cases from Feb to Mar before slowing dripping down to single digit in May. There are double digit COVID-19 cases in June
  • Technology related policy are introduced in Feb, especially thanks to policy like open data, people like us are able to do analysis on Korea COVID-19 cases.
  • After the spike in COVID-19 cases in March, education, social, health and immigration policy are introduced rapidly. Furthermore, the alert level was upgraded to Level 4 red.
  • After the introduction of the policies, the number of COVID-19 cases dropped drastically during April and May. Hence the policies are effective.
  • However, control of COVID-19 seems to have become lax, as cases slowly crept back up to double digits in May/June. This forced the government to introduce administrative policies, such as the closure of clubs, to control the situation.

Cases Over Time Analysis

  • As for 31st Jun 2020, 1.27 million of people has gone for testing, 1.24 million are tested negative, 12.8 thousand of confirmed cases, 11.5 thousand of released cases and 282 people has passed away.
  • Majority of the people are tested negative for covid-19 and less than 1% are tested positive for the virus.
  • Among those who are tested positive, almost 90% of the people has recovered and about 2% of the people has passed away

Cases Over Age & Time Analysis

  • Age group in the 20s 40s 50s 60s has higher infection cases as compared to other age group.
  • Furthermore, in the deceased section, most of the fatalities are in the age group of 60s 70s 80s,
  • This indicate that the risk of death from COVID-19 increases with age.

Cases Over Gender & Time Analysis

  • There are more female infected cases as compared to male cases.
  • The rate of increases of confirmed cases are about the same for both gender over the last 6 months
  • On the other hand, male are more prone to death to COVID-19 as compared to female

Cases Over Province & Time Analysis

  • The province of Daegu, Gyeongsangbuk-do, Incheon and Seoul has the most amount of covid-19 infection cases among the 17 provinces; with Daegu accounting for more than half of the cases.
  • The fatality count is relatively proportional to the infection count with the except of Seoul
  • Both Seoul and Gyeonggi-do have about 900 infected cases. However, Gyeonggi-do has about 22 fatalities while Seoul has about 7 fatalities.

Weather

  • The weather is about the same across the provinces.
  • The temperature gradually increases from around 5 degree to 25 degree for the period of Jan 2020 to June 2020
  • The weather is relatively the same across all provinces, it has little to do with the reported cases in the short term

Number of nursing home vs Population Ratio

  • We observed an interesting fact regarding the elderly population ratio and the number of nursing home in the city.
  • The elderly population ratio and the number of nursing home has an inverse relationship
  • Perhaps the absolute number of the elder people is smaller where elderly population ratio are high which explains why there is nursing home in places with higher elderly population ratio.

Preparing the data for modelling

We will create a model which will predict whether a COVID-19 patient will survive given a certain set of conditions

  • Sort city & province into different risk level according to number of confirmed cases (Low, Medium, High)
  • Sort age into different age level according to age (kid, adult, elderly)
  • Gender into categorical group of 0 & 1
  • Number of treatment day
  • Status of released, deceased into different category (We will ignore cases who are under treatment)

Get the necessary columns

In [62]:
# Keep only the columns the survival model needs.
patientData = pd.read_csv('covid/patientinfo.csv')
patientModellingData = patientData[['sex','age','province','confirmed_date','released_date','deceased_date','state']]

## We removed isolated patient since it is not confirmed if they survived or not
patientModellingData = patientModellingData[patientModellingData['state'] != 'isolated']
# dropna(subset=...) drops any row missing one of the required fields —
# a single vectorised call equivalent to the original per-column filter loop.
patientModellingData = patientModellingData.dropna(subset=['sex', 'age', 'confirmed_date'])

display(patientModellingData.head())
sex age province confirmed_date released_date deceased_date state
0 male 50s Seoul 2020-01-23 2020-02-05 NaN released
1 male 30s Seoul 2020-01-30 2020-03-02 NaN released
2 male 50s Seoul 2020-01-30 2020-02-19 NaN released
3 male 20s Seoul 2020-01-30 2020-02-15 NaN released
4 female 20s Seoul 2020-01-31 2020-02-24 NaN released

Calculate number of days of treatment before dropped date column

In [63]:
# Parse the three date columns; unparseable/missing values become NaT.
cols = ['released_date', 'deceased_date', 'confirmed_date']
patientModellingData[cols] = patientModellingData[cols].apply(pd.to_datetime, errors='coerce', axis=1)

def calculate_number_of_treatment_days(row):
    """Number of days a patient spent in treatment.

    Prefers the release date, falls back to the deceased date, and returns
    None when neither outcome date is known. Expects `row` to carry
    Timestamp/NaT values in 'released_date', 'deceased_date' and
    'confirmed_date' (produced by the to_datetime coercion above).
    """
    if row["released_date"] is not pd.NaT:
        # Timedelta.days is already an int — the original's pd.to_numeric
        # round-trip was a no-op and has been dropped.
        return (row['released_date'] - row['confirmed_date']).days
    elif row["deceased_date"] is not pd.NaT:
        return (row['deceased_date'] - row['confirmed_date']).days
    else:
        return None

# Attach the treatment duration, drop rows where it could not be computed,
# and keep only the modelling columns.
patientModellingData['Treatment Days'] = patientModellingData.apply(calculate_number_of_treatment_days, axis=1)

patientModellingData = patientModellingData[patientModellingData['Treatment Days'].notnull()]
patientModellingDataTreatment = patientModellingData[['sex', 'age', 'province', 'state', 'Treatment Days']]
display(patientModellingDataTreatment.head())
sex age province state Treatment Days
0 male 50s Seoul released 13.0
1 male 30s Seoul released 32.0
2 male 50s Seoul released 20.0
3 male 20s Seoul released 16.0
4 female 20s Seoul released 24.0

Convert state, gender and gender columns to categorical data

In [64]:
# Encode sex as a binary feature: male -> 0, female -> 1.
genders = {"male": 0, "female": 1}
patientModellingDataTreatment['sex'] = patientModellingDataTreatment['sex'].map(genders)
display(patientModellingDataTreatment.head())
sex age province state Treatment Days
0 0 50s Seoul released 13.0
1 0 30s Seoul released 32.0
2 0 50s Seoul released 20.0
3 0 20s Seoul released 16.0
4 1 20s Seoul released 24.0
In [65]:
# Encode the target: released (survived) -> 0, deceased -> 1.
state = {"released": 0, "deceased": 1}
patientModellingDataTreatment['state'] = patientModellingDataTreatment['state'].map(state)
display(patientModellingDataTreatment.head())
sex age province state Treatment Days
0 0 50s Seoul 0 13.0
1 0 30s Seoul 0 32.0
2 0 50s Seoul 0 20.0
3 0 20s Seoul 0 16.0
4 1 20s Seoul 0 24.0
In [66]:
# Map the age brackets ('0s'..'90s') to ordinal codes 0..9. Values missing
# from the mapping become NaN — hence the float dtype in the displayed head.
age = {'0s':0, '10s':1, '20s':2, '30s':3, '40s':4, '50s':5, '60s':6, '70s':7, '80s':8, '90s':9}
patientModellingDataTreatment['age'] = patientModellingDataTreatment['age'].map(age)
display(patientModellingDataTreatment.head())
sex age province state Treatment Days
0 0 5.0 Seoul 0 13.0
1 0 3.0 Seoul 0 32.0
2 0 5.0 Seoul 0 20.0
3 0 2.0 Seoul 0 16.0
4 1 2.0 Seoul 0 24.0

Get dummy for province

In [67]:
# One-hot encode province: one indicator column per province in the data.
provinceDummy = pd.get_dummies(patientModellingDataTreatment['province'])
provinceDummy
Out[67]:
Chungcheongbuk-do Chungcheongnam-do Daegu Daejeon Gangwon-do Gwangju Gyeonggi-do Gyeongsangbuk-do Gyeongsangnam-do Incheon Jeju-do Jeollabuk-do Jeollanam-do Sejong Seoul Ulsan
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5156 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5157 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5158 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5159 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5160 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

1635 rows × 16 columns

In [68]:
# Replace the raw province column with its one-hot encoding.
dataForModelling = patientModellingDataTreatment.join(provinceDummy).drop(columns='province')
dataForModelling
Out[68]:
sex age state Treatment Days Chungcheongbuk-do Chungcheongnam-do Daegu Daejeon Gangwon-do Gwangju Gyeonggi-do Gyeongsangbuk-do Gyeongsangnam-do Incheon Jeju-do Jeollabuk-do Jeollanam-do Sejong Seoul Ulsan
0 0 5.0 0 13.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
1 0 3.0 0 32.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
2 0 5.0 0 20.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
3 0 2.0 0 16.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
4 1 2.0 0 24.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5156 0 3.0 0 46.0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5157 1 2.0 0 32.0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5158 1 1.0 0 12.0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5159 1 3.0 0 34.0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
5160 1 3.0 0 14.0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0

1635 rows × 20 columns

Correlation Matrix

In [69]:
# Correlation heatmap of all model features (one-hot provinces included).
plt.figure(figsize=(10,5))
sns.heatmap(dataForModelling.corr(), cmap="RdPu")

dataForModelling['age'] = pd.to_numeric(dataForModelling['age'])

# Work on a copy so the un-cleaned frame remains inspectable.
dataForModellingForCleaning = dataForModelling.copy()

# Before-cleaning diagnostics: per-column counts, any-NaN, all-finite.
display(dataForModellingForCleaning.count())
print(np.any(np.isnan(dataForModellingForCleaning)))
print(np.all(np.isfinite(dataForModellingForCleaning)))

def clean_dataset(df):
    """Drop rows containing NaN or +/-inf and return the result as float64.

    Note: dropna mutates `df` in place, but the inf filter and the float cast
    apply only to the *returned* frame — callers must use the return value.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # axis must be passed by keyword: positional DataFrame.any(1) was
    # deprecated and then removed in pandas 2.0.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

# Keep the frame the function returns — the original discarded it and relied
# on the in-place dropna alone, losing the inf filter and the float64 cast.
dataForModellingForCleaning = clean_dataset(dataForModellingForCleaning)
display(dataForModellingForCleaning.count())
print(np.any(np.isnan(dataForModellingForCleaning)))
print(np.all(np.isfinite(dataForModellingForCleaning)))
sex                  1635
age                  1634
state                1635
Treatment Days       1635
Chungcheongbuk-do    1635
Chungcheongnam-do    1635
Daegu                1635
Daejeon              1635
Gangwon-do           1635
Gwangju              1635
Gyeonggi-do          1635
Gyeongsangbuk-do     1635
Gyeongsangnam-do     1635
Incheon              1635
Jeju-do              1635
Jeollabuk-do         1635
Jeollanam-do         1635
Sejong               1635
Seoul                1635
Ulsan                1635
dtype: int64
True
False
sex                  1634
age                  1634
state                1634
Treatment Days       1634
Chungcheongbuk-do    1634
Chungcheongnam-do    1634
Daegu                1634
Daejeon              1634
Gangwon-do           1634
Gwangju              1634
Gyeonggi-do          1634
Gyeongsangbuk-do     1634
Gyeongsangnam-do     1634
Incheon              1634
Jeju-do              1634
Jeollabuk-do         1634
Jeollanam-do         1634
Sejong               1634
Seoul                1634
Ulsan                1634
dtype: int64
False
True

Building Machine Learning Models Part 1

In [70]:
# Features (x) vs target (y = state: 0 released, 1 deceased), 33% held out.
# `x` is reused later for the tree diagram's feature names — keep the name.
x = dataForModellingForCleaning.drop("state", axis=1)
y = dataForModellingForCleaning["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.33, random_state=42)

Stochastic Gradient Descent (SGD):

In [71]:
# Stochastic Gradient Descent classifier; max_iter=5 keeps this demo fast.
# NOTE(review): no random_state, so the score can vary between runs.
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

# Training accuracy in percent (2 dp). The original computed sgd.score()
# twice and discarded the first result; the dead call has been removed.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

Random Forest:

In [72]:
# Random forest with 100 trees. Keep the name `random_forest`: the importance
# and cross-validation cells below reuse this fitted estimator.
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)

# Training accuracy in percent (2 dp); the duplicated, discarded score()
# call from the original has been removed.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

Logistic Regression:

In [73]:
# Logistic regression baseline; acc_log feeds the comparison table below.
logisticModel = LogisticRegression()
logisticModel.fit(X_train, Y_train)
Y_pred = logisticModel.predict(X_test)
acc_log = round(logisticModel.score(X_train, Y_train) * 100, 2)

Gaussian Naive Bayes:

In [74]:
# Gaussian Naive Bayes baseline.
naiveBayes = GaussianNB()
naiveBayes.fit(X_train, Y_train)
Y_pred = naiveBayes.predict(X_test)
acc_gaussian = round(naiveBayes.score(X_train, Y_train) * 100, 2)

K Nearest Neighbor:

In [75]:
# k-nearest neighbours with k = 3.
nearestNeighbours = KNeighborsClassifier(n_neighbors=3)
nearestNeighbours.fit(X_train, Y_train)
Y_pred = nearestNeighbours.predict(X_test)
acc_knn = round(nearestNeighbours.score(X_train, Y_train) * 100, 2)

Perceptron:

In [76]:
# Single-layer perceptron, capped at 5 iterations like the SGD classifier.
perceptronModel = Perceptron(max_iter=5)
perceptronModel.fit(X_train, Y_train)
Y_pred = perceptronModel.predict(X_test)
acc_perceptron = round(perceptronModel.score(X_train, Y_train) * 100, 2)

Linear Support Vector Machine:

In [77]:
# Linear support vector machine.
linearSvm = LinearSVC()
linearSvm.fit(X_train, Y_train)
Y_pred = linearSvm.predict(X_test)
acc_linear_svc = round(linearSvm.score(X_train, Y_train) * 100, 2)

Decision Tree

In [78]:
# Decision tree. Keep the name `decision_tree`: the graphviz diagram cell
# below exports this fitted estimator.
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)  
Y_pred = decision_tree.predict(X_test)  
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

Getting the best model

In [79]:
# Collect training accuracies and rank the models.
# (Label typo fixed: "Decent" -> "Descent". Note these are *training*
# accuracies, so tree-based models near 100% likely overfit.)
modelScores = [
    ('Support Vector Machines', acc_linear_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Descent', acc_sgd),
    ('Decision Tree', acc_decision_tree),
]
results = pd.DataFrame(modelScores, columns=['Model', 'Score'])
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)
Out[79]:
Model
Score
99.63 Random Forest
99.63 Decision Tree
98.54 KNN
98.45 Support Vector Machines
98.17 Logistic Regression
97.90 Perceptron
97.81 Stochastic Gradient Decent
38.48 Naive Bayes

Decision Tree Diagram

In [80]:
# Export the fitted decision tree to graphviz DOT and render it inline.
feature_cols = x.columns

dot_data = StringIO()
export_graphviz(decision_tree, out_file = dot_data, 
                      feature_names = feature_cols,  
                     filled = True, rounded = True,  
                    special_characters = True)

# NOTE(review): StringIO is imported from sklearn.externals.six at the top of
# the notebook; that module was removed in modern scikit-learn — io.StringIO
# is the drop-in replacement.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('covidTree.png')
Image(graph.create_png())

# Entropy 0 == no disorder // perfect knowledge, perfect classification
Out[80]:

What is the most important feature ?

In [81]:
# Random-forest feature importances, rounded and ranked.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(random_forest.feature_importances_, 3),
})
importances1 = importances.sort_values('importance', ascending=False).set_index('feature')
importances1.head(15)
Out[81]:
importance
feature
Treatment Days 0.594
age 0.218
Daegu 0.104
Gyeongsangbuk-do 0.021
sex 0.019
Gangwon-do 0.014
Seoul 0.011
Ulsan 0.005
Chungcheongnam-do 0.004
Gyeongsangnam-do 0.003
Incheon 0.003
Chungcheongbuk-do 0.002
Gyeonggi-do 0.002
Daejeon 0.001
Gwangju 0.000
In [82]:
# Bar chart of the raw (unsorted) per-feature importances.
barData = importances.reset_index()
fig = px.bar(barData, x='feature', y='importance')
fig.show()

Confusion Matrix with Precision & Recall & F-Score

In [83]:
# 3-fold cross-validated predictions on the training split, with the
# confusion matrix and precision / recall / F1 for the deceased class.
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result1_train = confusion_matrix(Y_train, predictions)
display(result1_train)

p1_train = precision_score(Y_train, predictions)
r1_train = recall_score(Y_train, predictions)
f1_train = f1_score(Y_train, predictions)
for label, score in [("Precision:", p1_train), ("Recall:", r1_train), ("F-Score:", f1_train)]:
    print(label, score)
array([[1042,    7],
       [  17,   28]])
Precision: 0.8
Recall: 0.6222222222222222
F-Score: 0.7000000000000001
In [84]:
# Same evaluation repeated on the held-out split.
# NOTE(review): cross_val_predict refits the model *within* the test set;
# scoring random_forest.predict(X_test) directly is the more usual check.
predictions = cross_val_predict(random_forest, X_test, Y_test, cv=3)
result1_test = confusion_matrix(Y_test, predictions)
display(result1_test)

p1_test = precision_score(Y_test, predictions)
r1_test = recall_score(Y_test, predictions)
f1_test = f1_score(Y_test, predictions)
for label, score in [("Precision:", p1_test), ("Recall:", r1_test), ("F-Score:", f1_test)]:
    print(label, score)
array([[513,   5],
       [  9,  13]])
Precision: 0.7222222222222222
Recall: 0.5909090909090909
F-Score: 0.65

Building Machine Learning Models Part 2

Let's observe what happens if we drop the Treatment Days column. Will the model be more accurate?

In [85]:
# Model 2 features: drop the target ('state') AND the 'Treatment Days' column.
x2 = dataForModellingForCleaning.drop(columns=["state", "Treatment Days"])
y2 = dataForModellingForCleaning["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x2, y2, test_size=0.33, random_state=42)

Random Forest

In [86]:
# Fit a 100-tree random forest on the reduced feature set (no treatment days).
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)

# Training-set accuracy in percent.  (The original had an extra bare
# random_forest.score(...) call whose result was discarded; removed.)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

Decision Tree

In [87]:
# Fit a plain decision tree on the same reduced feature set for comparison.
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)  
Y_pred = decision_tree.predict(X_test)  
# Training-set accuracy in percent (optimistic vs. held-out accuracy).
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

Getting the best model

In [88]:
# Rank the two model-2 classifiers by their training accuracy.
results = pd.DataFrame({
    'Model': ['Random Forest', 'Decision Tree'],
    'Score': [acc_random_forest, acc_decision_tree],
})
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head(2)
Out[88]:
Model
Score
97.07 Random Forest
97.07 Decision Tree

Decision Tree Diagram

In [89]:
# Render the model-2 decision tree (trained without treatment days) as a PNG.
feature_cols = x2.columns

dot_data = StringIO()
export_graphviz(
    decision_tree,
    out_file=dot_data,
    feature_names=feature_cols,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('covidTree2.png')
Image(graph.create_png())
Out[89]:

Importance

In [90]:
# Feature importances for model 2 (no treatment days), largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(random_forest.feature_importances_, 3),
})
importances2 = importances.sort_values('importance', ascending=False).set_index('feature')
importances2.head(15)
Out[90]:
importance
feature
age 0.440
Daegu 0.355
sex 0.072
Gangwon-do 0.043
Gyeongsangbuk-do 0.030
Ulsan 0.014
Chungcheongbuk-do 0.007
Chungcheongnam-do 0.006
Gyeonggi-do 0.006
Incheon 0.006
Seoul 0.006
Gyeongsangnam-do 0.005
Daejeon 0.002
Gwangju 0.002
Jeollanam-do 0.002
In [91]:
# Interactive bar chart of the raw (unsorted) model-2 feature importances.
barData = importances.reset_index()
fig = px.bar(barData, x='feature', y='importance')
fig.show()

Confusion Matrix with Precision & Recall & F-Score

In [92]:
# Cross-validated train-set evaluation for model 2 (no treatment days).
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result2_train = confusion_matrix(Y_train, predictions)
display(result2_train)

p2_train, r2_train, f2_train = (
    precision_score(Y_train, predictions),
    recall_score(Y_train, predictions),
    f1_score(Y_train, predictions),
)
print("Precision:", p2_train)
print("Recall:", r2_train)
print("F-Score:", f2_train)
array([[1047,    2],
       [  36,    9]])
Precision: 0.8181818181818182
Recall: 0.2
F-Score: 0.32142857142857145
In [93]:
# Cross-validated test-set evaluation for model 2 (no treatment days).
predictions = cross_val_predict(random_forest, X_test, Y_test, cv=3)
result2_test = confusion_matrix(Y_test, predictions)
display(result2_test)

p2_test, r2_test, f2_test = (
    precision_score(Y_test, predictions),
    recall_score(Y_test, predictions),
    f1_score(Y_test, predictions),
)
print("Precision:", p2_test)
print("Recall:", r2_test)
print("F-Score:", f2_test)
array([[506,  12],
       [ 13,   9]])
Precision: 0.42857142857142855
Recall: 0.4090909090909091
F-Score: 0.4186046511627907

Summary between the old and new model

The old model is better than the new model even though it has a lower precision rate.
The number of survivors of COVID-19 is much higher than the number of deceased.
Hence, it is much harder to predict who will pass away from the infection.

The new model is unable to predict whether a patient will pass away.
The number of false negatives outweighs the number of true positives.

On the other hand, the old model is able to predict the deceased much more accurately than the new model.
With that in mind, this also indicates that the number of days of treatment is vital.

Steps to take to control the COVID-19 situation even better

Generally, those who passed away from the infection had fewer days of treatment. However, is it right for us to conclude that the fewer the treatment days, the higher the fatality rate? Is the risk of dying higher during the first few days/weeks of treatment? Or is there something more to it?

Abraham Wald and the Missing Bullet Holes

This situation is something similar to Abraham Wald and the Missing Bullet Holes. @penguinpress summed up the situation perfectly:

You don't want your planes to get shot down by enemy fighters, so you armor them. But armor makes the plane heavier, and heavier planes are less maneuverable and use more fuel. Armoring the planes too much is a problem; armoring the planes too little is a problem. Somewhere in between there's an optimum.

When American planes came back from engagements over Europe, they were covered in bullet holes. But the damage wasn't uniformly distributed across the aircraft. There were more bullet holes in the fuselage, not so many in the engines.

title

At first glance, it seems reasonable to focus the armor on the fuselage. However, the armor, said Wald, doesn't go where the bullet holes are. It goes where the bullet holes aren't: on the engines.

The missing bullet holes were on the missing planes. The reason planes were coming back with fewer hits to the engine is that planes that got hit in the engine weren't coming back.

If you go to the recovery room at the hospital, you'll see a lot more people with bullet holes in their legs than people with bullet holes in their chests. But that's not because people don't get shot in the chest; it's because the people who get shot in the chest don't recover.

The link to the excellent article written by @penguinpress is in the credit section below.

Survivorship Bias

In this case, our model only consists of people who passed away after they had gone through treatment. We have excluded people who died before they even had the opportunity to go through treatment!

Even after we have excluded those who had not undergone treatment before they passed away, our model indicated that the number of treatment days is the strongest factor among gender, location and age. Treatment Days has a whopping importance of about 60%, next to age, which has an importance of about 20%.

Improving the situation

The chances of survival increases as the number of treatment days increases. Hence, the faster detection rate, the earlier the treatment, the more days and opportunities to treat the infection, the better the chances of survival. The Korea government put a very strong emphasis on COVID-19 detection programme and hence they managed to contain the COVID-19 situation.

Various cities among the world were also in lockdown to prevent the rapid spread of COVID-19. The healthcare system will be overwhelmed if COVID-19 were to spread widely which will cause the death rate to escalate as more people will have lesser opportunity to undergo treatment.

The older the person, the higher the chances of infection and death. Hence it is not advisable for elderly people to roam around unless it is necessary.

In terms of location, the majority of the cases came from Daegu, and the sources of infection were places such as churches and clubs. The higher the amount of contact, the greater the chance of infection. It is advisable for the authorities to close such areas until a vaccine for COVID-19 is found. From May to June, there was an increase in COVID-19 cases due to complacency.

https://www.aa.com.tr/en/asia-pacific/s-korea-sees-mass-covid-19-cases-linked-to-night-clubs/1838031

https://www.channelnewsasia.com/news/asia/south-korea-covid-19-church-backlash-13092284

Bonus: Refine Dataset for Machine Learning

Will the model's accuracy improve if we include patients who died without any treatment?

In [94]:
# Count deceased patients overall, then those with no recorded deceased_date
# (used below as a proxy for "died without receiving treatment").
# Fixes: removed a dead `patientModellingData` assignment that was never used
# in this cell, and the redundant `.isnull() == True` comparison.
patientData = pd.read_csv('covid/patientinfo.csv')
deadPatientModellingData = patientData[patientData['state'] == 'deceased']
totalDiedPatient = deadPatientModellingData.count()
display(totalDiedPatient)

display('***********')

pd.set_option('display.max_rows', deadPatientModellingData.shape[0] + 1)
# NOTE(review): only deceased_date is checked here, although the prose below also
# mentions released_date — confirm which definition of "no treatment" is intended.
totalDiedPaitentWithoutTreatment = deadPatientModellingData[deadPatientModellingData['deceased_date'].isnull()].count()
display(totalDiedPaitentWithoutTreatment)
patient_id            78
sex                   75
age                   75
country               78
province              78
city                  59
infection_case        36
infected_by            3
contact_number         7
symptom_onset_date     6
confirmed_date        78
released_date          3
deceased_date         66
state                 78
dtype: int64
'***********'
patient_id            12
sex                    9
age                    9
country               12
province              12
city                  12
infection_case         8
infected_by            2
contact_number         4
symptom_onset_date     4
confirmed_date        12
released_date          1
deceased_date          0
state                 12
dtype: int64

Assuming patients who died without any released_date or deceased_date are those who died without any treatment

There are about 12 patients who died without any treatment

In [95]:
f'{12 / 78 * 100}%'  # share of deceased patients with no recorded treatment
Out[95]:
'15.384615384615385%'

This accounts for about 15% of the total deaths

Preparing the data for modelling

We will simply add another line of logic that returns Treatment Days = 0
if the patient is deceased and both the released and deceased dates are empty.

In [96]:
# Reload the raw patient data and keep only the columns the model needs.
patientData = pd.read_csv('covid/patientinfo.csv')
patientModellingData = patientData[['sex', 'age', 'province', 'confirmed_date',
                                    'released_date', 'deceased_date', 'state']]

## Isolated patients are removed: their final outcome (released/deceased) is unknown.
patientModellingData = patientModellingData[patientModellingData['state'] != 'isolated']

# Rows missing any of these required fields cannot be used for modelling.
nullList = ['sex', 'age', 'confirmed_date']
for item in nullList:
    patientModellingData = patientModellingData[patientModellingData[item].notnull()]

display(patientModellingData.head())
sex age province confirmed_date released_date deceased_date state
0 male 50s Seoul 2020-01-23 2020-02-05 NaN released
1 male 30s Seoul 2020-01-30 2020-03-02 NaN released
2 male 50s Seoul 2020-01-30 2020-02-19 NaN released
3 male 20s Seoul 2020-01-30 2020-02-15 NaN released
4 female 20s Seoul 2020-01-31 2020-02-24 NaN released
In [97]:
# Parse the date columns to datetime; unparseable/missing values become NaT.
# Column-wise apply (default axis) replaces the original per-row apply
# (axis=1), which invoked pd.to_datetime once per row — same result, faster.
cols = ['released_date', 'deceased_date', 'confirmed_date']
patientModellingData[cols] = patientModellingData[cols].apply(pd.to_datetime, errors='coerce')

def calculate_number_of_treatment_days(row):
    """Return the number of treatment days for one patient row.

    Priority: released_date, then deceased_date, each measured from
    confirmed_date.  A deceased patient with neither date is assumed to
    have died without treatment (0 days).  Any other row yields None so
    it can be filtered out downstream.

    Fixes vs. original: dropped the redundant pd.to_numeric wrapper
    (Timedelta.days is already an int), the contradictory clause
    `state == 'deceased' and state != 'released'`, and the unreachable
    trailing else branch.
    """
    # to_datetime(errors='coerce') guarantees values are Timestamp or the
    # NaT singleton, so identity checks against pd.NaT are safe here.
    if row["released_date"] is not pd.NaT:
        return (row["released_date"] - row["confirmed_date"]).days
    if row["deceased_date"] is not pd.NaT:
        return (row["deceased_date"] - row["confirmed_date"]).days
    # Both dates missing: deceased patients count as 0 treatment days.
    if row["state"] == 'deceased':
        return 0
    return None

# Derive the treatment duration per row, then keep only usable rows.
patientModellingData['Treatment Days'] = patientModellingData.apply(
    calculate_number_of_treatment_days, axis=1)

patientModellingData = patientModellingData[patientModellingData['Treatment Days'].notnull()]
patientModellingDataTreatment = patientModellingData[['sex', 'age', 'province', 'state', 'Treatment Days']]
display(patientModellingDataTreatment.head())
sex age province state Treatment Days
0 male 50s Seoul released 13.0
1 male 30s Seoul released 32.0
2 male 50s Seoul released 20.0
3 male 20s Seoul released 16.0
4 female 20s Seoul released 24.0
In [98]:
# Encode the categorical columns as small integers for scikit-learn.
genders = {"male": 0, "female": 1}
state = {"released": 0, "deceased": 1}
age = {'0s': 0, '10s': 1, '20s': 2, '30s': 3, '40s': 4,
       '50s': 5, '60s': 6, '70s': 7, '80s': 8, '90s': 9}

for column, mapping in (('sex', genders), ('state', state), ('age', age)):
    patientModellingDataTreatment[column] = patientModellingDataTreatment[column].map(mapping)

display(patientModellingDataTreatment.head())
sex age province state Treatment Days
0 0 5.0 Seoul 0 13.0
1 0 3.0 Seoul 0 32.0
2 0 5.0 Seoul 0 20.0
3 0 2.0 Seoul 0 16.0
4 1 2.0 Seoul 0 24.0
In [99]:
# Replace the raw 'province' string column with its one-hot dummy columns.
dataForModelling = patientModellingDataTreatment.join(provinceDummy).drop('province', axis=1)
dataForModelling
Out[99]:
sex age state Treatment Days Chungcheongbuk-do Chungcheongnam-do Daegu Daejeon Gangwon-do Gwangju Gyeonggi-do Gyeongsangbuk-do Gyeongsangnam-do Incheon Jeju-do Jeollabuk-do Jeollanam-do Sejong Seoul Ulsan
0 0 5.0 0 13.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 0 3.0 0 32.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2 0 5.0 0 20.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
3 0 2.0 0 16.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
4 1 2.0 0 24.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5156 0 3.0 0 46.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
5157 1 2.0 0 32.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
5158 1 1.0 0 12.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
5159 1 3.0 0 34.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
5160 1 3.0 0 14.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0

1643 rows × 20 columns

Building Machine Learning Models Part 3

In [100]:
# Model 3: treatment days included, plus patients who died without treatment.
clean_dataset(dataForModelling)
display(dataForModelling.count())
# Sanity checks: no NaNs and no infinities left after cleaning.
print(np.any(np.isnan(dataForModelling)))
print(np.all(np.isfinite(dataForModelling)))

x3 = dataForModelling.drop(["state"], axis=1)
y3 = dataForModelling["state"]
X_train, X_test, Y_train, Y_test = train_test_split(x3, y3, test_size=0.33, random_state=42)

## Random forest
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)

# Training-set accuracy in percent.  (Removed the original's extra bare
# random_forest.score(...) call whose result was discarded.)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

## Decision tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

# Rank the two classifiers by training accuracy.
results = pd.DataFrame({
    'Model': ['Random Forest', 'Decision Tree'],
    'Score': [acc_random_forest, acc_decision_tree],
})
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head(2)
sex                  1634
age                  1634
state                1634
Treatment Days       1634
Chungcheongbuk-do    1634
Chungcheongnam-do    1634
Daegu                1634
Daejeon              1634
Gangwon-do           1634
Gwangju              1634
Gyeonggi-do          1634
Gyeongsangbuk-do     1634
Gyeongsangnam-do     1634
Incheon              1634
Jeju-do              1634
Jeollabuk-do         1634
Jeollanam-do         1634
Sejong               1634
Seoul                1634
Ulsan                1634
dtype: int64
False
True
Out[100]:
Model
Score
99.63 Random Forest
99.63 Decision Tree

Importances

In [101]:
# Feature importances for model 3, largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(random_forest.feature_importances_, 3),
})
importances3 = importances.sort_values('importance', ascending=False).set_index('feature')
importances3.head(15)
Out[101]:
importance
feature
Treatment Days 0.592
age 0.208
Daegu 0.121
sex 0.020
Gyeongsangbuk-do 0.017
Gangwon-do 0.011
Seoul 0.006
Chungcheongnam-do 0.006
Ulsan 0.005
Gyeongsangnam-do 0.004
Incheon 0.003
Chungcheongbuk-do 0.002
Gyeonggi-do 0.002
Daejeon 0.001
Gwangju 0.000

Confusion Matrix with Precision & Recall

In [102]:
# Cross-validated train-set evaluation for model 3.
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
result3_train = confusion_matrix(Y_train, predictions)
display(result3_train)

p3_train, r3_train, f3_train = (
    precision_score(Y_train, predictions),
    recall_score(Y_train, predictions),
    f1_score(Y_train, predictions),
)
print("Precision:", p3_train)
print("Recall:", r3_train)
print("F-Score:", f3_train)
array([[1041,    8],
       [  15,   30]])
Precision: 0.7894736842105263
Recall: 0.6666666666666666
F-Score: 0.7228915662650601
In [103]:
# Cross-validated test-set evaluation for model 3.
predictions = cross_val_predict(random_forest, X_test, Y_test, cv=3)
result3_test = confusion_matrix(Y_test, predictions)
display(result3_test)

p3_test, r3_test, f3_test = (
    precision_score(Y_test, predictions),
    recall_score(Y_test, predictions),
    f1_score(Y_test, predictions),
)
print("Precision:", p3_test)
print("Recall:", r3_test)
print("F-Score:", f3_test)
array([[513,   5],
       [  9,  13]])
Precision: 0.7222222222222222
Recall: 0.5909090909090909
F-Score: 0.65

Precision Recall Curve

Precision/recall tradeoff

In [104]:
# Class-1 probabilities of the fitted random forest on the training data.
y_scores = random_forest.predict_proba(X_train)[:, 1]

precision, recall, threshold = precision_recall_curve(Y_train, y_scores)

def plot_precision_and_recall(precision, recall, threshold):
    """Plot precision and recall as functions of the decision threshold."""
    # precision/recall have one more entry than threshold, hence [:-1].
    plt.plot(threshold, precision[:-1], "r-", label="precision", linewidth=5)
    plt.plot(threshold, recall[:-1], "b", label="recall", linewidth=5)
    plt.xlabel("threshold", fontsize=10)
    plt.legend(loc="upper right", fontsize=10)
    plt.ylim([0, 1])

plt.figure(figsize=(10, 5))
plot_precision_and_recall(precision, recall, threshold)
plt.show()
In [105]:
def plot_precision_vs_recall(precision, recall):
    """Plot the precision/recall trade-off curve (recall on x, precision on y)."""
    plt.plot(recall, precision, "g--", linewidth=2.5)
    # Bug fix: plt.plot(recall, precision) puts recall on the x-axis and
    # precision on the y-axis — the original labels were swapped.
    plt.xlabel("recall", fontsize=19)
    plt.ylabel("precision", fontsize=19)
    plt.axis([0, 1.5, 0, 1.5])

plt.figure(figsize=(10, 5))
plot_precision_vs_recall(precision, recall)
plt.show()

ROC AUC Curve

True positive rate (also called recall) against the false positive rate (ratio of incorrectly classified negative instances)

In [106]:
# True positive rate (recall) vs. false positive rate across thresholds.
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, y_scores)

def plot_roc_curve(false_positive_rate, true_positive_rate, label=None):
    """Plot the ROC curve with the y = x 'random classifier' diagonal."""
    plt.plot(false_positive_rate, true_positive_rate, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'r', linewidth=4)  # chance line
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)

plt.figure(figsize=(10, 5))
plot_roc_curve(false_positive_rate, true_positive_rate)
plt.show()
In [107]:
# Area under the ROC curve on the training-set probabilities (1.0 == perfect ranking).
r_a_score = roc_auc_score(Y_train, y_scores)
print("ROC-AUC-Score:", r_a_score)
ROC-AUC-Score: 0.9998199343289906

Final Summary

In [108]:
# Final comparison of the three experiments.  The original repeated the same
# print/display pattern three times per section; the artifacts are collected
# once and iterated over, producing byte-identical output.
# Each tuple: (title, importance table, train CM, test CM,
#              (train precision, recall, f-score), (test precision, recall, f-score)).
model_summaries = [
    ('Model with treatment day :',
     importances1, result1_train, result1_test,
     (p1_train, r1_train, f1_train), (p1_test, r1_test, f1_test)),
    ('Model without treatment day: ',
     importances2, result2_train, result2_test,
     (p2_train, r2_train, f2_train), (p2_test, r2_test, f2_test)),
    ('Model with treatment day and those who died without treatment:',
     importances3, result3_train, result3_test,
     (p3_train, r3_train, f3_train), (p3_test, r3_test, f3_test)),
]

SEPARATOR = '******************************'

# Section 1: top-3 importances and confusion matrices per model.
for position, (title, importance_table, cm_train, cm_test, _, _) in enumerate(model_summaries):
    if position:
        print(SEPARATOR)
    print(title)
    print('importance:', importance_table.head(3))
    display(cm_train)
    display(cm_test)

# Section 2: precision / recall / F-score on train and test data per model.
for title, _, _, _, train_metrics, test_metrics in model_summaries:
    print(SEPARATOR)
    print(title)
    print("Train Data Precision:", train_metrics[0])
    print("Train Data Recall:", train_metrics[1])
    print("Train Data F-Score:", train_metrics[2])
    print("Test Data Precision:", test_metrics[0])
    print("Test Data Recall:", test_metrics[1])
    print("Test Data F-Score:", test_metrics[2])
Model with treatment day :
importance:                 importance
feature                   
Treatment Days       0.594
age                  0.218
Daegu                0.104
array([[1042,    7],
       [  17,   28]])
array([[513,   5],
       [  9,  13]])
******************************
Model without treatment day: 
importance:          importance
feature            
age           0.440
Daegu         0.355
sex           0.072
array([[1047,    2],
       [  36,    9]])
array([[506,  12],
       [ 13,   9]])
******************************
Model with treatment day and those who died without treatment:
importance:                 importance
feature                   
Treatment Days       0.592
age                  0.208
Daegu                0.121
array([[1041,    8],
       [  15,   30]])
array([[513,   5],
       [  9,  13]])
******************************
Model with treatment day :
Train Data Precision: 0.8
Train Data Recall: 0.6222222222222222
Train Data F-Score: 0.7000000000000001
Test Data Precision: 0.7222222222222222
Test Data Recall: 0.5909090909090909
Test Data F-Score: 0.65
******************************
Model without treatment day: 
Train Data Precision: 0.8181818181818182
Train Data Recall: 0.2
Train Data F-Score: 0.32142857142857145
Test Data Precision: 0.42857142857142855
Test Data Recall: 0.4090909090909091
Test Data F-Score: 0.4186046511627907
******************************
Model with treatment day and those who died without treatment:
Train Data Precision: 0.7894736842105263
Train Data Recall: 0.6666666666666666
Train Data F-Score: 0.7228915662650601
Test Data Precision: 0.7222222222222222
Test Data Recall: 0.5909090909090909
Test Data F-Score: 0.65

Credit

Abraham Wald and the Missing Bullet Holes:
https://medium.com/@penguinpress/an-excerpt-from-how-not-to-be-wrong-by-jordan-ellenberg-664e708cfc3d

Decision Tree Diagram:
Ze Yu's sample project

In [ ]: